This is me fiddling around with an employee attrition dataset on my last few days at PNC.

Description

Uncover the factors that lead to employee attrition and explore important questions such as ‘show me a breakdown of distance from home by job role and attrition’ or ‘compare average monthly income by education and attrition’. This is a fictional data set created by IBM data scientists.

Education 1 ‘Below College’ 2 ‘College’ 3 ‘Bachelor’ 4 ‘Master’ 5 ‘Doctor’

EnvironmentSatisfaction 1 ‘Low’ 2 ‘Medium’ 3 ‘High’ 4 ‘Very High’

JobInvolvement 1 ‘Low’ 2 ‘Medium’ 3 ‘High’ 4 ‘Very High’

JobSatisfaction 1 ‘Low’ 2 ‘Medium’ 3 ‘High’ 4 ‘Very High’

PerformanceRating 1 ‘Low’ 2 ‘Good’ 3 ‘Excellent’ 4 ‘Outstanding’

RelationshipSatisfaction 1 ‘Low’ 2 ‘Medium’ 3 ‘High’ 4 ‘Very High’

WorkLifeBalance 1 ‘Bad’ 2 ‘Good’ 3 ‘Better’ 4 ‘Best’

Read in Data

library(tidyverse)
library(plotly)
library(readr)
library(devtools)
library(caret)

df <- read_csv(file='ibm_attrition_file.csv')

my_PNC_theme <- theme_bw()
my_PNC_colors = c("#0452cf", "#2c67c7", "#5a83c4", "#b0b0ac", "#e8a061", "#e88631", "#e86e05")
my_PNC_colors_BIG = c("#0080FF","#006DD9", "#005AB2", "#2693FF", "#4CA6FF", "#73B9FF", "#b0b0ac", "#FFB973","#FFA54C","#FF9226", "#D96C00",  "#8C4600", "#B25900",  "#FF7F00")

EDA

Preview Data

First few rows and summarize classification spread

head(df)
## # A tibble: 6 x 35
##     Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education
##   <dbl> <chr>     <chr>              <dbl> <chr>                 <dbl>     <dbl>
## 1    41 Yes       Travel_Rarely       1102 Sales                     1         2
## 2    49 No        Travel_Freque…       279 Research …                8         1
## 3    37 Yes       Travel_Rarely       1373 Research …                2         2
## 4    33 No        Travel_Freque…      1392 Research …                3         4
## 5    27 No        Travel_Rarely        591 Research …                2         1
## 6    32 No        Travel_Freque…      1005 Research …                2         2
## # … with 28 more variables: EducationField <chr>, EmployeeCount <dbl>,
## #   EmployeeNumber <dbl>, EnvironmentSatisfaction <dbl>, Gender <chr>,
## #   HourlyRate <dbl>, JobInvolvement <dbl>, JobLevel <dbl>, JobRole <chr>,
## #   JobSatisfaction <dbl>, MaritalStatus <chr>, MonthlyIncome <dbl>,
## #   MonthlyRate <dbl>, NumCompaniesWorked <dbl>, Over18 <chr>, OverTime <chr>,
## #   PercentSalaryHike <dbl>, PerformanceRating <dbl>,
## #   RelationshipSatisfaction <dbl>, StandardHours <dbl>,
## #   StockOptionLevel <dbl>, TotalWorkingYears <dbl>,
## #   TrainingTimesLastYear <dbl>, WorkLifeBalance <dbl>, YearsAtCompany <dbl>,
## #   YearsInCurrentRole <dbl>, YearsSinceLastPromotion <dbl>,
## #   YearsWithCurrManager <dbl>
nrow(df)
## [1] 1470
plot_ly(alpha = .7) %>%
  add_histogram(
    x = ~df$Attrition,
    name = 'Employee Attrition'
    ) %>%
  layout(
    title = "~15% employee churn in dataset",
    xaxis = list(
      title = 'Attrition',
      zeroline = FALSE
    ),
    yaxis = list(
      title = 'Count',
      zeroline = FALSE
    )
  )
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

Single Variables

Age Distribution

age_dist_plot <- ggplot(data = df, mapping = aes(x = Age)) +
  geom_histogram(fill = my_PNC_colors[2],  
                 line = list(color = "darkgray",
                            width = 5),bins = 30
                 ) +
  labs(title = "Employees distributed by age", 
       x = "Age",
       y = "Count") +
  geom_vline(xintercept = median(df$Age), col = my_PNC_colors[6]) +
  geom_text(x = median(df$Age), y = 4000, label = "50%", col = my_PNC_colors[6], size =  3) +
  geom_vline(xintercept = quantile(df$Age, .25), col = my_PNC_colors[6], linetype = "dashed") +
  geom_text(x = quantile(df$Age, .25), y = -20, label = "25%", col = my_PNC_colors[6], size = 2.5) +
    geom_vline(xintercept = quantile(df$Age, .75), col = my_PNC_colors[6], linetype = "dashed") +
  geom_text(x = quantile(df$Age, .75), y = -20, label = "75%", col = my_PNC_colors[6], size = 2.5) +
  my_PNC_theme
## Warning: Ignoring unknown parameters: line
ggplotly(age_dist_plot)

Attrition rates are higher among younger employees

df_attr <- df %>% filter(Attrition=='Yes')

p_age_dist <- plot_ly(alpha = .7) %>% 
  add_histogram(x = ~df$Age,
                name = "All Employees",
                bingroup = 1) %>% 
  add_histogram(x = ~df_attr$Age,
                name = "Attrited Employees",
                bingroup = 1) %>% 
  layout(barmode = "overlay",
         title = "Employee Attrition Distributed by Age",
         xaxis = list(title = "Age",
                      zeroline = FALSE),
         yaxis = list(title = "Count",
                      zeroline = FALSE))

p_age_dist

Job Satisfaction

High Job Satisfaction still yields attrition but at a lower frequency.

fig <- plot_ly(
  type='histogram',
  x=as.character(df$JobSatisfaction),
  bingroup=1)

fig <- fig %>% add_trace(
  type='histogram',
  x=as.character(df[which(df$Attrition=='Yes'),]$JobSatisfaction),
  bingroup=1)

fig <- fig %>% layout(
  barmode="overlay",
  bargap=0.1)

fig

Gender and Marital Status

Single men and women are more likely to leave the company but age may be a confounder. Men are slightly more likely to leave than women.

df %>% 
  group_by(Gender, MaritalStatus) %>%
  summarize(attr_pct = sum(ifelse(Attrition=='Yes', 1, 0))/n()) -> df_marital
## `summarise()` regrouping output by 'Gender' (override with `.groups` argument)
plot_ly(
  x = df_marital$Gender,
  y = df_marital$MaritalStatus,
  z = df_marital$attr_pct, 
  type='heatmap'
)

Feature Correlation

library(heatmaply)
## Loading required package: viridis
## Loading required package: viridisLite
## 
## ======================
## Welcome to heatmaply version 1.1.1
## 
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
## 
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## Or contact: <tal.galili@gmail.com>
## ======================
library(ggcorrplot)

num_x <- dplyr::select_if(df, is.numeric) %>%
  select(-c(EmployeeCount, EmployeeNumber, StandardHours))
heatmaply_cor(
  cor(num_x),
  xlab = "Features", 
  ylab = "Features",
  k_col = 2, 
  k_row = 2
)
## Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
##   dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 6): Library not loaded: /opt/X11/lib/libSM.6.dylib
##   Referenced from: /Library/Frameworks/R.framework/Versions/3.6/Resources/modules/R_X11.so
##   Reason: image not found

Prediction

Split Data

names(df)
##  [1] "Age"                      "Attrition"               
##  [3] "BusinessTravel"           "DailyRate"               
##  [5] "Department"               "DistanceFromHome"        
##  [7] "Education"                "EducationField"          
##  [9] "EmployeeCount"            "EmployeeNumber"          
## [11] "EnvironmentSatisfaction"  "Gender"                  
## [13] "HourlyRate"               "JobInvolvement"          
## [15] "JobLevel"                 "JobRole"                 
## [17] "JobSatisfaction"          "MaritalStatus"           
## [19] "MonthlyIncome"            "MonthlyRate"             
## [21] "NumCompaniesWorked"       "Over18"                  
## [23] "OverTime"                 "PercentSalaryHike"       
## [25] "PerformanceRating"        "RelationshipSatisfaction"
## [27] "StandardHours"            "StockOptionLevel"        
## [29] "TotalWorkingYears"        "TrainingTimesLastYear"   
## [31] "WorkLifeBalance"          "YearsAtCompany"          
## [33] "YearsInCurrentRole"       "YearsSinceLastPromotion" 
## [35] "YearsWithCurrManager"
head(df)
## # A tibble: 6 x 35
##     Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education
##   <dbl> <chr>     <chr>              <dbl> <chr>                 <dbl>     <dbl>
## 1    41 Yes       Travel_Rarely       1102 Sales                     1         2
## 2    49 No        Travel_Freque…       279 Research …                8         1
## 3    37 Yes       Travel_Rarely       1373 Research …                2         2
## 4    33 No        Travel_Freque…      1392 Research …                3         4
## 5    27 No        Travel_Rarely        591 Research …                2         1
## 6    32 No        Travel_Freque…      1005 Research …                2         2
## # … with 28 more variables: EducationField <chr>, EmployeeCount <dbl>,
## #   EmployeeNumber <dbl>, EnvironmentSatisfaction <dbl>, Gender <chr>,
## #   HourlyRate <dbl>, JobInvolvement <dbl>, JobLevel <dbl>, JobRole <chr>,
## #   JobSatisfaction <dbl>, MaritalStatus <chr>, MonthlyIncome <dbl>,
## #   MonthlyRate <dbl>, NumCompaniesWorked <dbl>, Over18 <chr>, OverTime <chr>,
## #   PercentSalaryHike <dbl>, PerformanceRating <dbl>,
## #   RelationshipSatisfaction <dbl>, StandardHours <dbl>,
## #   StockOptionLevel <dbl>, TotalWorkingYears <dbl>,
## #   TrainingTimesLastYear <dbl>, WorkLifeBalance <dbl>, YearsAtCompany <dbl>,
## #   YearsInCurrentRole <dbl>, YearsSinceLastPromotion <dbl>,
## #   YearsWithCurrManager <dbl>
library(dplyr)
df %>% 
  dplyr::select(Attrition, Age, BusinessTravel, DailyRate, Department, DistanceFromHome, Education, EnvironmentSatisfaction, 
                Gender, HourlyRate, JobInvolvement,JobLevel, JobRole, JobSatisfaction, MaritalStatus, MonthlyIncome, MonthlyRate,
                NumCompaniesWorked, OverTime, PercentSalaryHike, PerformanceRating, RelationshipSatisfaction, StockOptionLevel,
                TotalWorkingYears, TrainingTimesLastYear, WorkLifeBalance, YearsAtCompany, YearsInCurrentRole, YearsSinceLastPromotion,
                YearsWithCurrManager) -> df_pred

## 75% of the sample size
smp_size <- floor(0.75 * nrow(df_pred))

## set the seed to make your partition reproducible
set.seed(123)
train_ind <- sample(seq_len(nrow(df_pred)), size = smp_size)

train <- df_pred[train_ind, ]
test <- df_pred[-train_ind, ]

test %>% group_by(Attrition) %>% tally()
## # A tibble: 2 x 2
##   Attrition     n
##   <chr>     <int>
## 1 No          302
## 2 Yes          66
train %>% group_by(Attrition) %>% tally()
## # A tibble: 2 x 2
##   Attrition     n
##   <chr>     <int>
## 1 No          931
## 2 Yes         171

Linear Discriminant Analysis

Linear Discriminant Analysis (LDA) is a feature reduction method for data with discrete classes. It is like PCA except that it takes advantage of information about the classification in the training data. It projects data into fewer dimensions by maximizing both the mean distance between the median data point of each class and minimizing the “spread” within each class.

Normalize Data

library(MASS)
library(caret)

# Estimate preprocessing parameters
preproc.param <- train %>% 
  preProcess(method = c("center", "scale"))
# Transform the data using the estimated parameters
train.transformed <- preproc.param %>% predict(train)
test.transformed <- preproc.param %>% predict(test)

Fit LDA formula to reduce to 1 dimension

fit <- lda(Attrition ~ .,  data=train.transformed)
# Make predictions
predictions <- fit %>% predict(test.transformed)
# Model accuracy
mean(predictions$class==test.transformed$Attrition)
## [1] 0.8695652
fit
## Call:
## lda(Attrition ~ ., data = train.transformed)
## 
## Prior probabilities of groups:
##        No       Yes 
## 0.8448276 0.1551724 
## 
## Group means:
##             Age BusinessTravelTravel_Frequently BusinessTravelTravel_Rarely
## No   0.08403305                       0.1729323                   0.7142857
## Yes -0.45751327                       0.2982456                   0.6491228
##       DailyRate DepartmentResearch & Development DepartmentSales
## No   0.02116198                        0.6745435       0.2857143
## Yes -0.11521522                        0.5730994       0.3742690
##     DistanceFromHome   Education EnvironmentSatisfaction GenderMale
## No        -0.0229024  0.01866583              0.04521538  0.6165414
## Yes        0.1246908 -0.10162506             -0.24617262  0.6140351
##       HourlyRate JobInvolvement    JobLevel JobRoleHuman Resources
## No   0.004268794     0.05758122  0.07923603             0.02900107
## Yes -0.023241212    -0.31349774 -0.43139617             0.05263158
##     JobRoleLaboratory Technician JobRoleManager JobRoleManufacturing Director
## No                     0.1578947     0.08485499                    0.11278195
## Yes                    0.2631579     0.01754386                    0.05847953
##     JobRoleResearch Director JobRoleResearch Scientist JobRoleSales Executive
## No               0.065520945                 0.1965628              0.2126745
## Yes              0.005847953                 0.1988304              0.2105263
##     JobRoleSales Representative JobSatisfaction MaritalStatusMarried
## No                   0.04189044      0.04304638            0.4908700
## Yes                  0.15204678     -0.23436360            0.3567251
##     MaritalStatusSingle MonthlyIncome  MonthlyRate NumCompaniesWorked
## No            0.2717508    0.07725473  0.004449091       -0.008251001
## Yes           0.5087719   -0.42060909 -0.024222828        0.044922115
##     OverTimeYes PercentSalaryHike PerformanceRating RelationshipSatisfaction
## No    0.2352309       0.009277256      0.0009095309               0.01043468
## Yes   0.5146199      -0.050509507     -0.0049518905              -0.05681105
##     StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance
## No        0.06013557        0.07774277             0.0242523      0.02494059
## Yes      -0.32740475       -0.42326618            -0.1320403     -0.13578767
##     YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion
## No      0.05923639         0.06950496              0.02213622
## Yes    -0.32250923        -0.37841588             -0.12051942
##     YearsWithCurrManager
## No            0.06144507
## Yes          -0.33453425
## 
## Coefficients of linear discriminants:
##                                          LD1
## Age                              -0.31382997
## BusinessTravelTravel_Frequently   0.85006785
## BusinessTravelTravel_Rarely       0.37271972
## DailyRate                        -0.06225762
## DepartmentResearch & Development  0.16675536
## DepartmentSales                   0.39728225
## DistanceFromHome                  0.14502871
## Education                         0.02819707
## EnvironmentSatisfaction          -0.26383015
## GenderMale                        0.07614074
## HourlyRate                       -0.02626801
## JobInvolvement                   -0.28249441
## JobLevel                          0.01286254
## JobRoleHuman Resources            1.01170545
## JobRoleLaboratory Technician      0.77204358
## JobRoleManager                    0.16090277
## JobRoleManufacturing Director     0.16169254
## JobRoleResearch Director          0.03828262
## JobRoleResearch Scientist         0.19140001
## JobRoleSales Executive            0.22250585
## JobRoleSales Representative       1.38783855
## JobSatisfaction                  -0.26522562
## MaritalStatusMarried              0.06232714
## MaritalStatusSingle               0.68210944
## MonthlyIncome                    -0.08575937
## MonthlyRate                      -0.01138975
## NumCompaniesWorked                0.21414690
## OverTimeYes                       1.21323465
## PercentSalaryHike                -0.10151441
## PerformanceRating                 0.07169970
## RelationshipSatisfaction         -0.11233641
## StockOptionLevel                 -0.09672352
## TotalWorkingYears                -0.03993764
## TrainingTimesLastYear            -0.13114119
## WorkLifeBalance                  -0.14639612
## YearsAtCompany                    0.12803471
## YearsInCurrentRole               -0.21860306
## YearsSinceLastPromotion           0.19916812
## YearsWithCurrManager             -0.12850848
ct <- table(test.transformed$Attrition, predictions$class)
ct
##      
##        No Yes
##   No  294   8
##   Yes  40  26
diag(prop.table(ct, 1))
##        No       Yes 
## 0.9735099 0.3939394
# # total percent correct

Plot Fit

# Predicted classes
head(predictions$class, 6)
## [1] Yes No  Yes No  Yes No 
## Levels: No Yes
# Predicted probabilities of class memebership.
head(predictions$posterior, 6) 
##          No         Yes
## 1 0.4137657 0.586234260
## 2 0.9555013 0.044498702
## 3 0.1516319 0.848368093
## 4 0.9911454 0.008854612
## 5 0.2984373 0.701562672
## 6 0.9537101 0.046289939
# Linear discriminants
head(predictions$x, 3) 
##          LD1
## 1  1.8708056
## 2 -0.3830858
## 3  2.7772201
lda.data <- cbind(train.transformed, predict(fit)$x)

plot(fit)

ggplot(lda.data, aes(LD1, y=LD1+rnorm(nrow(lda.data)))) +
  geom_point(aes(color = Attrition))